Initial Mini Example

Ramsey County: Percent White

# packages and data

library(tigris)
library(sf)
library(tidyverse)
library(randomcoloR)
library(leaflet)

# Ramsey county block groups

bg <- block_groups(state = "MN", county = "Ramsey") %>%
  st_transform(26915)

# True census tracts

tract <- tracts(state = "MN", county = "Ramsey") %>%
  st_transform(26915)

# Jolly tracts

aggregate <- st_read("code/output/prototype.shp") %>%
  st_transform(26915)

As an initial picture, there are 401 block groups in Ramsey county.

leaflet(bg %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = "#faf7cf",
              weight = 2,
              opacity = 0.8,
              color = "#b0b5bf",
              fillOpacity = 0.6)

And 137 census tracts.

leaflet(tract %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = "#e89f9b",
              weight = 2,
              opacity = 0.8,
              color = "#b0b5bf",
              fillOpacity = 0.6)

As an example about how to aggregate the block to groups to a larger geography to see a decrease in the margin of error I’ll use percent white as the target variable. The spatial distribution of the total white population is mapped below.

percent_white <- read_csv("code/example_data/white_prop_est_ramsey.csv") %>% left_join(read_csv("code/example_data/race_moe_est_ramsey.csv"))

percent_white_bg <- bg %>% left_join(percent_white %>% mutate(id = as.character(id)), by = c("GEOID" = "id"))

pal <- colorQuantile("RdPu", domain = percent_white_bg$estimate_not_hispanic_or_latino_white_alone, n = 5)

leaflet(percent_white_bg %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~pal(estimate_not_hispanic_or_latino_white_alone),
  weight = 2,
  opacity = 1,
  color = "white",
  fillOpacity = 0.7) %>%
  addLegend(position = "bottomright",
            pal = pal,
            values = ~estimate_not_hispanic_or_latino_white_alone,
            title = "Total white population")

But we know that some block groups have high margins of error. Mapped below are the coefficients of variation (cv), or the percent of the estimate covered by the margin of error.

percent_white_bg <- percent_white_bg %>%
  mutate(cv = if_else(estimate_not_hispanic_or_latino_white_alone == 0, NA_real_, (margin_of_error_not_hispanic_or_latino_white_alone / 1.645) / estimate_not_hispanic_or_latino_white_alone))

pal <- colorQuantile("PuBu", domain = percent_white_bg$cv, na.color = "#edf0f2", n = 5)

leaflet(percent_white_bg %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~pal(cv),
  weight = 2,
  opacity = 1,
  color = "white",
  fillOpacity = 0.7) %>%
  addLegend(position = "bottomright",
            pal = pal,
            values = ~cv,
            title = "Total white population coefficient of variation")

We can see below the most common cv is about 0.12, or 12% of the estimate for a particular block group. The general guideline is that estimates with a cv less than 0.12 are “highly reliable.” Only 30% of our block groups meet that threshold.

#sum(percent_white_bg$cv > 0.12, na.rm = TRUE) /nrow(percent_white_bg)

ggplot(percent_white_bg, aes(x = cv)) +
  geom_histogram(fill = "#688da6", color = "white") +
  scale_x_continuous(breaks = seq(0, 1, .1)) +
  theme_minimal() +
  labs(y = "Block groups", x = "Coefficient of variation (total white population)")

Instead, we can aggregate the block groups to some other geography that makes sense contextually. That context changes based on the input variables. In this case we only care about contiguous block groups that look similar based on their white population. We will aggregate the block groups until each new aggregate meets some minimum cv threshold. Initially I set it to 10%, so it could go higher in later iterations.

The aggregation algorithm created 230 new geography units. Notably, that is a better resolution than the 137 census tracts. We also are guaranteed some level of homogeneity, which is not necessarily true for tracts. Below are the new aggregations with the census tract boundaries overlayed. Fill color indicates the assigned region.

colors <- randomcoloR::distinctColorPalette(k = 230)

pal <- colorFactor(colors, domain = as.factor(aggregate$rids))

leaflet(aggregate %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~pal(as.character(rids)),
  weight = 2,
  opacity = 1,
  color = NA,
  fillOpacity = 0.7,
  stroke = FALSE) %>%
  addPolygons(data = tract %>% st_transform(4326),
              color = "black",
              weight = 2,
              fillColor = NA,
              fillOpacity = 0)

We see considerable improvement in the margins of error, but there are some that still fall above the 10% threshold so I can experiment more with other variables and thresholds. About 53% are still above the 0.12 mark, but the extremes are much closer to the desired threshold.

aggregate_rids <- aggregate %>%
  select(-ALAND, -AWATER) %>%
  left_join(percent_white_bg %>% st_set_geometry(NULL)) %>%
  group_by(rids) %>%
  summarise(sumest = sum(estimate_not_hispanic_or_latino_white_alone),
            moeest = tidycensus::moe_sum(margin_of_error_not_hispanic_or_latino_white_alone, estimate = estimate_not_hispanic_or_latino_white_alone)) %>%
  filter(rids != -999) %>%
  mutate(cv = (moeest / 1.645)/sumest)

#sum(aggregate_rids$cv > 0.12, na.rm = TRUE) /nrow(aggregate_rids)

ggplot(aggregate_rids, aes(x = cv)) +
  geom_histogram(fill = "#688da6", color = "white") +
  scale_x_continuous(breaks = seq(0, 1, .1)) +
  theme_minimal() +
  labs(y = "Aggregates/jolly tracts", x = "Coefficient of variation (total white population)")

pal <- colorQuantile("PuBu", domain = percent_white_bg$cv, na.color = "#edf0f2", n = 5)

leaflet(aggregate_rids %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~pal(cv),
  weight = 2,
  opacity = 1,
  color = "white",
  fillOpacity = 0.7) %>%
  addLegend(position = "bottomright",
            pal = pal,
            values = ~cv,
            title = "Aggregated coefficient of variation")

Full county example

Seven county metro area: Percent White, Under 18, Over 65, Housing Units since 2000

# packages and data

library(tigris)
library(sf)
library(tidyverse)
library(randomcoloR)
library(leaflet)

# take out only met council region

counties_fips <- c("003", "053", "123", "163", "019", "139", "037")

# Ramsey county block groups

bg <- st_read("code/output/shp/tl_2017_27_bg.shp") %>%
  st_transform(26915) %>%
  filter(COUNTYFP %in% counties_fips)

# True census tracts

tract <- st_read("code/output/shp/tl_2017_27_tract.shp") %>%
  st_transform(26915) %>%
  filter(COUNTYFP %in% counties_fips)

# Jolly tracts

aggregate <- st_read("code/output/metc7cnty_4var.shp") %>%
  st_transform(26915)

As an initial picture, there are 2,085 block groups in the seven county region.

leaflet(bg %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = "#faf7cf",
              weight = 2,
              opacity = 0.8,
              color = "#b0b5bf",
              fillOpacity = 0.6)

And 704 census tracts.

leaflet(tract %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = "#e89f9b",
              weight = 2,
              opacity = 0.8,
              color = "#b0b5bf",
              fillOpacity = 0.6)

We want to be able to subdivide the region into some smaller geographies that are contextually meaningful.

The variables I will use are percent white to illustrate demographics, over 65 and under 18 to illustrate family structure, and housing units built since 2000 to illustrate the housing stock. These are all going to be counted as proportions in the algorithm, rather than raw counts.

By mapping the covariates individually, we can see some patterns.

demographics <- read_csv("code/metc_data/target/target_estimates_metc.csv") %>% left_join(read_csv("code/metc_data/target/target_moe_metc.csv"))

demographics_bg <- bg %>% left_join(demographics %>% mutate(id = as.character(id)), by = c("GEOID" = "id")) %>%
  mutate(percent_white = totalwhite/totalppl)

pal <- colorQuantile("RdPu", domain = demographics_bg$percent_white, n = 5)

leaflet(demographics_bg %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~pal(percent_white),
  weight = 2,
  opacity = 1,
  color = "white",
  fillOpacity = 0.7) %>%
  addLegend(position = "bottomright",
            pal = pal,
            values = ~percent_white,
            title = "Percent white population")
demographics_bg$percent_over_65 <- demographics_bg$over65 / demographics_bg$total65

pal <- colorQuantile("OrRd", domain = demographics_bg$percent_over_65, n = 5)

leaflet(demographics_bg %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~pal(percent_over_65),
  weight = 2,
  opacity = 1,
  color = "white",
  fillOpacity = 0.7) %>%
  addLegend(position = "bottomright",
            pal = pal,
            values = ~percent_over_65,
            title = "Percent over 65 years old")
demographics_bg$hu_since_2000 <- demographics_bg$built_after_2000 / demographics_bg$totalhu

pal <- colorQuantile("PuRd", domain = demographics_bg$hu_since_2000, n = 4)

leaflet(demographics_bg %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~pal(hu_since_2000),
  weight = 2,
  opacity = 1,
  color = "white",
  fillOpacity = 0.7) %>%
  addLegend(position = "bottomright",
            pal = pal,
            values = ~hu_since_2000,
            title = "Housing stock built since 2000")

The regionalization algorithm generated 167 aggregate regions. This is lower than I had expected.

colors <- randomcoloR::distinctColorPalette(k = 230)

pal <- colorFactor(colors, domain = as.factor(aggregate$rids))

leaflet(aggregate %>% st_transform(4326)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(fillColor = ~pal(as.character(rids)),
  weight = 2,
  opacity = 1,
  color = NA,
  fillOpacity = 0.7,
  stroke = FALSE) %>%
  addPolygons(data = tract %>% st_transform(4326),
              color = "black",
              weight = 2,
              fillColor = NA,
              fillOpacity = 0)
library(tidycensus)
aggregate_rids <- aggregate %>%
  select(-ALAND, -AWATER) %>%
  left_join(demographics_bg %>% st_set_geometry(NULL)) %>%
  group_by(rids) %>%
  summarise(sumest_white = sum(totalwhite),
            sumest_pop = sum(totalppl),
            sumest_under18 = sum(under18),
            sumest_over65 = sum(over65),
            sumest_hu2000 = sum(built_after_2000),
            moeest_white = tidycensus::moe_sum(totalwhite_moe, estimate = totalwhite),
            moeest_pop = moe_sum(totalppl_moe, estimate = totalppl),
            moeest_under18 = moe_sum(under18_moe, estimate = under18),
            moeest_over65 = moe_sum(over65_moe, estimate = over65),
            moeest_hu2000 = moe_sum(built_after_2000_moe, estimate = built_after_2000)) %>%
  filter(rids != -999) 

(Tip: To better see the Jolly tract boundaries, select that layer last.)

pal_hu <- colorQuantile("PuRd", domain = demographics_bg$hu_since_2000, n = 4)
pal_65 <- colorQuantile("OrRd", domain = demographics_bg$percent_over_65, n = 5)
pal_white <- colorQuantile("RdPu", domain = demographics_bg$percent_white, n = 5)

leaflet() %>%
  # Base groups
  addTiles(group = "OSM (default)") %>%
  addProviderTiles(providers$CartoDB.Positron, group = "Carto basemap") %>%
  addProviderTiles(providers$OpenStreetMap.BlackAndWhite, group = "OSM black and white") %>%
  # Overlay groups
  addPolygons(data = demographics_bg %>% st_transform(4326), 
              fillColor = ~pal_hu(hu_since_2000),
  weight = 2,
  opacity = 1,
  color = NA,
  stroke = FALSE,
  fillOpacity = 0.7,
  group = "Housing stock since 2000") %>%
  addPolygons(data = demographics_bg %>% st_transform(4326), 
              fillColor = ~pal_65(percent_over_65),
  weight = 2,
  opacity = 1,
  color = NA,
  stroke = FALSE,
  fillOpacity = 0.7,
  group = "Population over 65") %>%
  addPolygons(data = demographics_bg %>% st_transform(4326), 
              fillColor = ~pal_white(percent_white),
  weight = 2,
  opacity = 1,
  color = NA,
  stroke = FALSE,
  fillOpacity = 0.7,
  group = "Percent white") %>%
  addPolygons(data = aggregate_rids %>% st_transform(4326),
              color = "black",
              weight = 2,
              fillColor = NA,
              fillOpacity = 0,
              group = "Jolly tract boundary") %>%
  # Layers control
  addLayersControl(
    baseGroups = c("OSM (default)", "Carto basemap", "OSM black and white"),
    overlayGroups = c("Housing stock since 2000", "Population over 65", "Percent white", "Jolly tract boundary"),
    options = layersControlOptions(collapsed = FALSE)
  )